# RECITublet R script v1.0 (last updated 07/07/2021)
# Author: Rachael Bashford-Rogers (Wellcome Centre for Human Genetics, University of Oxford, rbr1@well.ox.ac.uk)
# This script will identify scRNA-seq droplets that have features of doublets/multiplets based on CITE-seq information
#### identifying VDJ doublets/multiplets
#### load requirements
library(Seurat)
library(harmony)
library(ggplot2)
concat = function(v) {
res = ""
for (i in 1:length(v)){res = paste(res,v[i],sep="")}
res
}
########################################################
#### load in object with gene expression and ADT data
#### example is run of 3 healthy PBMC samples (from 10X website)
batch = "CC2" #### batch name for all outputs
pbmc = readRDS(file=concat(c("Seurat_object_CITE_CC2.seurat"))) #### downloadable from: https://www.dropbox.com/s/tuwamdw4zz5mswb/Seurat_object_CITE_CC2.seurat?dl=0
### GEX QC (standard)
QC_GEX<-function(object, prefix_figures){
mito.genes <- grep(pattern = "^MT-", x = rownames(x = object@assays$ RNA@ counts), value = TRUE)
length(mito.genes)
percent.mito <- Matrix::colSums(object@assays$ RNA@ counts[mito.genes, ]) / Matrix::colSums(object@assays$ RNA@ counts)
object <- AddMetaData(object = object,metadata = percent.mito,col.name = "percent.mito")
matrix = object@assays$ RNA@ counts
mitrib <- grep(pattern = "^MT-|^RP[SL]", x = rownames(x = matrix), value = TRUE, ignore.case = TRUE)
mito <- grep(pattern = "^MT-", x = rownames(x = matrix), value = TRUE, ignore.case = TRUE)
mitribcounts<- matrix[which(rownames(matrix) %in% mitrib), ]
mitoribo_ratio <- Matrix::colSums(mitribcounts[mito, , drop = FALSE])/Matrix::colSums(mitribcounts)
object <- AddMetaData(object = object, metadata = as.data.frame(mitoribo_ratio) ,col.name = "mito.ribo_ratio")
fileout1=concat(c(prefix_figures,"1.pdf"))
w=5
pdf(file=fileout1, height=w*3, width=w*5)
par(mfrow= c(1,1), mar = c(8,4,4,4))
VlnPlot(object,features = c("nFeature_RNA", "nCount_RNA","nCount_ADT", "percent.mito"),ncol = 1, group.by = "orig.ident" )
dev.off()
object <- subset(object, subset = nFeature_RNA > 500 & nCount_RNA > 1000 & percent.mito < 0.20)
table(object@ meta.data$orig.ident)
object <- NormalizeData(object)
object <- FindVariableFeatures(object, selection.method = "vst", nfeatures = 2000)
object <- ScaleData(object = object)
return(object)
}
prefix_figures = concat(c("Seurat_filtering_", batch,"_"))
pbmc = QC_GEX(pbmc, prefix_figures)
######### batch correct samples and UMAP
Batch_corrections_and_dimensionality_reduction<-function(object, prefix_figures){
object <- RunPCA(object, verbose = FALSE)
object <- RunHarmony(object, c("orig.ident"),plot_convergence = TRUE, nclust = 60, max.iter.cluster = 100, max.iter.harmony = 10)
library(cowplot)
fileout1=concat(c(prefix_figures,"2.pdf"))
w=6
pdf(file=fileout1, height=w*1, width=w*3)
par(mfrow= c(1,1), mar = c(4,4,4,4))
p1 <- DimPlot(object = object, reduction = "harmony", pt.size = .1, group.by = "orig.ident", do.return = TRUE)
p2 <- VlnPlot(object = object, features = "harmony_1", group.by = "orig.ident", pt.size = .1)
plot_grid(p1,p2)
dev.off()
object = RunUMAP(object ,reduction = "harmony", dims = 1:20)
object = FindNeighbors(object ,reduction = "umap", dims = 1:2)
object = FindClusters(object,reduction = "umap" ,resolution = 0.1)
fileout1=concat(c(prefix_figures,"3.pdf"))
w=6
pdf(file=fileout1, height=w*1, width=w*2.6)
par(mfrow= c(1,1), mar = c(4,4,4,4))
p1 <- DimPlot(object = object, reduction = "umap", pt.size = .1, group.by = "orig.ident", do.return = TRUE)
p2 <- DimPlot(object = object, reduction = "umap", group.by = "seurat_clusters", pt.size = .1, do.return = TRUE)
plot_grid(p1,p2)
dev.off()
return(object)
}
pbmc = Batch_corrections_and_dimensionality_reduction(pbmc, prefix_figures)
##### load in labels for cell types
cell_type = readRDS(file=concat(c("Seurat_harmonised_pre_predicate_CITE_CC1.all_cell_types"))) #### downloadable from: https://www.dropbox.com/s/uswu0jdnbgrycof/Seurat_harmonised_pre_predicate_CITE_CC1.all_cell_types?dl=0
cell_id = rownames(pbmc@meta.data)
cell_id_ct = names(cell_type)
samples = sort(unique(pbmc@meta.data$orig.ident))
for(i in c(1:length(samples))){
cell_id_ct = gsub(concat(c("||", samples[i])), concat(c("_",i)), cell_id_ct, fixed = T)
}
length(intersect(cell_id, cell_id_ct))
length(setdiff(cell_id ,cell_id_ct))
cell_type_object = rep("-", length(cell_id))
names(cell_type_object) = cell_id
cell_type_object[cell_id_ct] = cell_type
pbmc@meta.data$cell_type= cell_type_object[cell_id]
fileout1=concat(c(prefix_figures,"4.pdf"))
w=6
pdf(file=fileout1, height=w*1, width=w*2.1)
par(mfrow= c(1,1), mar = c(4,4,4,4))
p1 <- DimPlot(object = pbmc, reduction = "umap", group.by = "cell_type", pt.size = .1, do.return = TRUE)
plot_grid(p1)
dev.off()
saveRDS(file=concat(c("Seurat_object_", batch,".seurat_post_filtering")), pbmc)
########################################################
######### normalise ADT
pbmc = readRDS(file=concat(c("Seurat_object_", batch,".seurat_post_filtering")))
Normalise_ADT<-function(object){
object <- NormalizeData(object, assay = "ADT", normalization.method = "CLR")
object <- ScaleData(object, assay = "ADT")
return(object)}
pbmc=Normalise_ADT(pbmc)
############# CITE-seq - gene matching files
CITE_seq = rownames(pbmc@assays$ ADT@counts)
ref_file = "CITE_Seq_gene_name.txt" #### downloadable from: https://www.dropbox.com/s/rqdazl5lhrtqwgf/CITE_Seq_gene_name.txt?dl=0
### file containing the mapping of CITE-seq names with gene/protein names
p <- as.matrix(read.csv(ref_file, head=T, sep="\t"))
gene_CITE =p[match(CITE_seq, p[,1]),2]
names(CITE_seq) = gene_CITE
CITE_seq_protein = p[,3]
names(CITE_seq_protein) = p[,1]
gene_CITEs = sort(unique(gene_CITE))
orig.ident = pbmc@meta.data$orig.ident
orig.idents = sort(unique(orig.ident))
CITE_data_all =pbmc@assays$ ADT@scale.data
GEX_data_all =pbmc@assays$ RNA@counts
############# functions: threshold CITE-seq data to determine which cells are expressing each marker by taking the best CITE-seq divider between GEX postivie and GEX negative.
Plot_CITE_seq_versus_gene_expression<-function(gene_CITE, gene_CITEs, CITE_data_all,orig.ident, orig.idents, CITE_seq, GEX_data_all, prefix_figures){
for(i in c(1:length(CITE_seq))){
fileout1=concat(c(prefix_figures, CITE_seq[i],".pdf"))
library(RColorBrewer)
cols = add.alpha (brewer.pal(8, "Dark2"), alpha = 0.5)
w=3.25
pdf(file=fileout1, height=w*5, width=w*6)
par(mfrow= c(5,6), mar = c(5,5,3,3))
for(s in c(1:length(orig.idents))){
w = which(orig.ident== orig.idents[s])
x = CITE_data_all [CITE_seq[i], w]
y = GEX_data_all[gene_CITE[i], w]
w1 = intersect(which(is.na(x)==F),which(is.na(y)==F))
x = x[w1]
y = y[w1]
if(length(x)>100){
print (length(x))
plot (y, x, col = cols[1], pch = 21, bg = cols[1],main = concat(c(orig.idents[s],"\n", CITE_seq[i])), xlab = gene_CITE[i], ylab = CITE_seq[i])
}}
dev.off()
}
}
Get_mean_gene_expression<-function(gene_CITE, cluster, gene_CITEs, GEX_data_all){
clusters = sort(unique(cluster))
m_mean_expression = t(matrix(data = 0,nrow = length(clusters),ncol = length(gene_CITEs), dimnames = c(list(clusters),list(gene_CITEs))))
m_quantile_expression = t(matrix(data = 0,nrow = length(clusters),ncol = length(gene_CITEs), dimnames = c(list(clusters),list(gene_CITEs))))
for(c in c(1:length(clusters))){
w = which(cluster== clusters[c])
m_mean_expression[,clusters[c]] = apply(GEX_data_all[gene_CITEs,w], 1, mean)
m_quantile_expression[,clusters[c]] = apply(GEX_data_all[gene_CITEs,w], 1, function(x){quantile(x,prob = 0.9)})
print(c) }
CITE_gene_expression = c(list(m_mean_expression), list(m_quantile_expression))
names(CITE_gene_expression) = c("mean", "90th quantile")
return(CITE_gene_expression)
}
CITE_seq_level_per_sample<-function(orig.ident, orig.idents, CITE_seq, cluster, CITE_data_all){
clusters = sort(unique(cluster))
m_CITE_expression_by_sample = t(matrix(data = 0,nrow = length(orig.idents),ncol = length(CITE_seq), dimnames = c(list(orig.idents),list(CITE_seq))))
m_CITE_expression_by_cluster = t(matrix(data = 0,nrow = length(clusters),ncol = length(CITE_seq), dimnames = c(list(clusters),list(CITE_seq))))
non_na = NULL
for(i in c(1:length(CITE_seq))){
non_na = c(non_na, list(which(is.na(CITE_data_all[CITE_seq[i],])==F)))}
names(non_na) = CITE_seq
for(c in c(1:length(orig.idents))){
w = which(orig.ident== orig.idents[c])
print (c)
for(i in c(1:length(CITE_seq))){
w1 = intersect(non_na[[i]],w )
m_CITE_expression_by_sample[CITE_seq[i], orig.idents[c]] = mean(CITE_data_all[CITE_seq[i],w1])
}}
for(c in c(1:length(clusters))){
print(c)
w = which(cluster== clusters[c])
for(i in c(1:length(CITE_seq))){
w1 = intersect(non_na[[i]],w )
m_CITE_expression_by_cluster[CITE_seq[i],clusters[c]] = mean(CITE_data_all[CITE_seq[i],w1])
}}
CITE_seq_expression = c(list(m_CITE_expression_by_sample), list(m_CITE_expression_by_cluster))
names(CITE_seq_expression) = c("m_CITE_expression_by_sample", "m_CITE_expression_by_cluster")
return(CITE_seq_expression)
}
Threshold_CITE_seq_positivity<-function(orig.idents, orig.ident, CITE_seq, m_CITE_expression_by_sample, CITE_data_all, CITE_gene_expression, cell_ids){
library(MASS)
thresholds = t(matrix(data = 0,nrow = length(orig.idents),ncol = length(CITE_seq), dimnames = c(list(orig.idents),list(CITE_seq))))
prediction_accuracy = t(matrix(data = 0,nrow = length(orig.idents),ncol = length(CITE_seq), dimnames = c(list(orig.idents),list(CITE_seq))))
pos_accuracy = t(matrix(data = 0,nrow = length(orig.idents),ncol = length(CITE_seq), dimnames = c(list(orig.idents),list(CITE_seq))))
neg_accuracy = t(matrix(data = 0,nrow = length(orig.idents),ncol = length(CITE_seq), dimnames = c(list(orig.idents),list(CITE_seq))))
cell_ids_CITE_GEX = intersect(cell_ids, colnames(CITE_data_all))
for(i in c(1:length(CITE_seq))){
sample_use = names(which(CITE_seq_expression$m_CITE_expression_by_sample[CITE_seq[i],]!=0))
quantile = CITE_gene_expression $ '90th quantile'[names(CITE_seq)[i],]
quantile = quantile[which(is.na(quantile)==F)]
mean = CITE_seq_expression$m_CITE_expression_by_cluster[CITE_seq[i],]
mean = mean[which(is.na(mean)==F)]
pos = intersect(names(which(quantile>= quantile(quantile, 0.5))), names(which(mean >= quantile(mean, 0.5))))
neg = setdiff(names(which(quantile <= quantile(quantile, 0.1))), pos)
w_use_na = names(which(is.na(CITE_data_all[CITE_seq[i],])==F))
# neg = setdiff(clusters, pos)
for(s in c(1:length(sample_use))){
w = cell_ids [which(orig.ident== sample_use[s])]
w1 = intersect(w, cell_ids [which(cluster %in% neg)])
w2 = intersect(w, cell_ids [which(cluster %in% pos)])
w1 = intersect(intersect(w1, cell_ids_CITE_GEX), w_use_na)
w2 = intersect(intersect(w2, cell_ids_CITE_GEX), w_use_na)
if(min(c(length(w1), length(w2)))>5){
x_pos = CITE_data_all[CITE_seq[i],w2]
x_neg = CITE_data_all[CITE_seq[i],w1]
x = c(x_neg, x_pos)
if(length(x)>10){
if(length(unique(x))>2){
x1 = c(x_neg, x_pos)
class = c(rep("neg", length(x_neg)), rep("pos", length(x_pos)))
ids = c(w1,w2)
class1 = data.frame(ids )
class1$class = factor(class)
class1$x = x
model <- lda(class ~x, data = class1)
predictions <- model %>% predict(class1)
if(min(table(predictions $class))>5){
x2 = data.frame(seq(from = max(x[which(predictions $class=="neg")]), to = min(x[which(predictions $class=="pos")]), length = 10000))
colnames(x2) = 'x'
pred = predict(model, newdata = x2)$class
thr = mean(c(min(x2[which(pred=="pos"),]), max(x2[which(pred=="neg"),])))
acc = length(which(predictions$class==class))*100/length(class)
pacc = length(intersect(which(class=="pos"),which(x1>= thr)))*100/length(which(class=="pos"))
nacc = length(intersect(which(class=="neg"),which(x1< thr)))*100/length(which(class=="neg"))
thresholds[CITE_seq[i],sample_use[s]] = thr
prediction_accuracy [CITE_seq[i],sample_use[s]] = acc
pos_accuracy [CITE_seq[i],sample_use[s]] = pacc
neg_accuracy [CITE_seq[i],sample_use[s]] = nacc
print(i)
}}}}
}
}
CITE_seq_positivity_thresholds = c(list(thresholds), list(prediction_accuracy), list(pos_accuracy), list(neg_accuracy))
names(CITE_seq_positivity_thresholds) = c(("thresholds"),("prediction_accuracy"),("pos_accuracy"),("neg_accuracy"))
return(CITE_seq_positivity_thresholds)
}
Choose_CITE_seq_markers<-function(CITE_seq, CITE_seq_positivity_thresholds, Tpos_accuracy, Tneg_accuracy, Tprediction_accuracy){
headers = c("thresholds", "prediction_accuracy", "pos_accuracy", "neg_accuracy")
thrs = matrix(data = 0,nrow = length(CITE_seq),ncol = length(headers), dimnames = c(list(CITE_seq),list(headers)))
for(i in c(1:length(CITE_seq))){
w1 = intersect(which(CITE_seq_positivity_thresholds$thresholds[CITE_seq[i],]!=0), which(is.finite(CITE_seq_positivity_thresholds$thresholds[i,])))
w2 = intersect(which(CITE_seq_positivity_thresholds$prediction_accuracy[i,]!=0), which(is.finite(CITE_seq_positivity_thresholds$prediction_accuracy[i,])))
w3 = intersect(which(CITE_seq_positivity_thresholds$pos_accuracy[i,]!=0), which(is.finite(CITE_seq_positivity_thresholds$pos_accuracy[i,])))
w4 = intersect(which(CITE_seq_positivity_thresholds$neg_accuracy[i,]!=0), which(is.finite(CITE_seq_positivity_thresholds$neg_accuracy[i,])))
thrs[i,] = c(mean(CITE_seq_positivity_thresholds$thresholds[i,w1]), mean(CITE_seq_positivity_thresholds$prediction_accuracy[i,w2]), mean(CITE_seq_positivity_thresholds$pos_accuracy[i,w3]), mean(CITE_seq_positivity_thresholds$neg_accuracy[i,w4]))
}
CITE_seq_use = CITE_seq[intersect (intersect(which(thrs[,'pos_accuracy']> Tpos_accuracy), which(thrs[,'neg_accuracy']> Tneg_accuracy)), intersect(which(is.finite(thrs[,'thresholds'])==T), which(thrs[,'prediction_accuracy']> Tprediction_accuracy)))]
info_CITE_seq_use = thrs [CITE_seq_use,]
CITE_seq_choose = c(list(thrs), list(CITE_seq_use), list(info_CITE_seq_use))
names(CITE_seq_choose) = c("thrs","CITE_seq_use","info_CITE_seq_use")
return(CITE_seq_choose)
}
Plot_CITE_seq_thresholds<-function(CITE_seq_choose, prefix_figures, CITE_seq_positivity_thresholds, CITE_data_all, GEX_data_all){
library(RColorBrewer)
cols = rep(add.alpha (brewer.pal(8, "Dark2"), alpha = 0.5), 100)
w=4
fileout1 = concat(c(prefix_figures,"1.pdf"))
pdf(file=fileout1, height=w*4, width=w*4)
par(mfrow= c(5,5), mar = c(5,5,3,3))
sample_match = match(orig.ident, orig.idents)
for(i in c(1:length(CITE_seq_choose$CITE_seq_use))){
sample_use = names(which(CITE_seq_expression$m_CITE_expression_by_sample[CITE_seq_choose$CITE_seq_use[i],]!=0))
if(length(sample_use)>0){
for(s in c(1:length(sample_use))){
w = which(orig.ident== sample_use[s])
x = GEX_data_all[names(CITE_seq_choose$CITE_seq_use)[i],w]
y = CITE_data_all[(CITE_seq_choose$CITE_seq_use[i]),w]
w1 = intersect(which(is.na(x)==F),which(is.na(y)==F))
x = x[w1]
y = y[w1]
if(length(unique(y))>10){
plot (x, y, col = cols[sample_match[w]], pch = 21, bg = cols[sample_match[w]],main = concat(c(sample_use[s],"\n", CITE_seq_choose$CITE_seq_use[i])),
xlab = names(CITE_seq_choose$CITE_seq_use)[i], ylab = (CITE_seq_choose$CITE_seq_use[i]))
segments(-100, CITE_seq_positivity_thresholds $thresholds[CITE_seq_choose$CITE_seq_use[i], sample_use[s]], 100000, CITE_seq_positivity_thresholds $thresholds[CITE_seq_choose$CITE_seq_use[i], sample_use[s]], lwd = 2, col = "red")
}}
}
print (i)
}
dev.off()
}
Score_CITE_positivity<-function(CITE_seq_choose, CITE_seq_positivity_thresholds, CITE_data_all, cell_ids){
CITE_score = (CITE_data_all*NA)
for(i in c(1:length(CITE_seq_choose$CITE_seq_use))){
sample_use = names(which(CITE_seq_expression$m_CITE_expression_by_sample [CITE_seq_choose$CITE_seq_use[i],]!=0))
if(length(sample_use)>0){
for(s in c(1:length(sample_use))){
w = cell_ids [which(orig.ident== sample_use[s])]
threshold = CITE_seq_positivity_thresholds $thresholds[CITE_seq_choose$CITE_seq_use[i], sample_use[s]]
w1 = intersect(w, names (which(CITE_data_all[CITE_seq_choose$CITE_seq_use[i],]> threshold)))
if(length(w1)>0){
x = CITE_data_all[CITE_seq_choose$CITE_seq_use[i],w1]
#score = rank(x)/length(x)
score = (x-min(x))
score = (score/max(score))+1
CITE_score[CITE_seq_choose$CITE_seq_use[i],w1] = score}}}
print (i)
}
return(CITE_score)}
Plot_CITE_scores<-function(CITE_score, CITE_data_all, CITE_seq_choose, orig.ident, orig.idents, prefix_figures,CITE_seq_expression){
library(RColorBrewer)
cols = add.alpha (brewer.pal(8, "Dark2"), alpha = 0.5)
w=4
fileout1 = concat(c(prefix_figures,"2.txt"))
pdf(file=fileout1, height=w*4, width=w*4)
par(mfrow= c(5,5), mar = c(5,5,3,3))
sample_match = match(orig.ident, orig.idents)
for(i in c(1:length(CITE_seq_choose$CITE_seq_use))){
sample_use = names(which(CITE_seq_expression$m_CITE_expression_by_sample[CITE_seq_choose$CITE_seq_use[i],]!=0))
if(length(sample_use)>0){
for(s in c(1:length(sample_use))){
w = which(orig.ident== sample_use[s])
scores = CITE_score[CITE_seq_choose$CITE_seq_use[i],w]
scaled_expression = CITE_data_all[CITE_seq_choose$CITE_seq_use[i],w]
scores[which(is.na(scores))] = 0
plot (scaled_expression, scores, col = cols[1], pch = 21, bg = cols[1],main = concat(c(sample_use[s],"\n", CITE_seq_choose$CITE_seq_use[i])))
}}
print (i)
}
dev.off()
}
############# run thresholding functions
prefix_figures = concat(c("CITE_expression", batch,"_"))
Plot_CITE_seq_versus_gene_expression(gene_CITE, gene_CITEs, CITE_data_all,orig.ident, orig.idents, CITE_seq, GEX_data_all, prefix_figures)
cell_type = pbmc@meta.data$cell_type
cluster = pbmc@meta.data$seurat_clusters
CITE_seq_expression = CITE_seq_level_per_sample(orig.ident, orig.idents, CITE_seq, cluster= cluster, CITE_data_all)
CITE_gene_expression = Get_mean_gene_expression(gene_CITE, cluster, gene_CITEs, GEX_data_all)
############# predict thresholds of positive CITE seq cells
cell_ids = rownames(pbmc@meta.data)
CITE_seq_positivity_thresholds = Threshold_CITE_seq_positivity(orig.idents, orig.ident, CITE_seq, CITE_seq_expression, CITE_data_all, CITE_gene_expression,cell_ids)
############# choose CITE seq markers that have good predictions
CITE_seq_choose = Choose_CITE_seq_markers(CITE_seq, CITE_seq_positivity_thresholds,Tpos_accuracy = 20, Tneg_accuracy = 30, Tprediction_accuracy = 60)
############# plot CITE-seq thresholds
Plot_CITE_seq_thresholds(CITE_seq_choose, prefix_figures, CITE_seq_positivity_thresholds, CITE_data_all, GEX_data_all)
############# generate CITE-seq score per positive cell and save output
CITE_score = Score_CITE_positivity(CITE_seq_choose, CITE_seq_positivity_thresholds, CITE_data_all, cell_ids)
saveRDS(file=concat(c("CITE_seq_scale.CITE_score")), CITE_score)
############# plot scores versus expression of CITE seq
prefix_figures = concat(c("CITE_expression", batch,"_"))
Plot_CITE_scores(CITE_score, CITE_data_all, CITE_seq_choose, orig.ident, orig.idents, prefix_figures,CITE_seq_expression)
############# identify doublets/multiplets based on mutually exclusive markers
CITE_score = readRDS(file=concat(c("CITE_seq_scale.CITE_score")))
CITE_score = t(CITE_score) [cell_ids,]
CITE_genes = colnames(CITE_score)
names(CITE_genes) = names(CITE_seq)[match(CITE_genes , CITE_genes)]
ref_file = "CITE_seq_mutually_exclusive_expression.txt" ### from dropbox link: https://www.dropbox.com/s/8p7qq9o8qqrk83y/CITE_seq_mutually_exclusive_expression.txt?dl=0
p <- as.matrix(read.csv(ref_file, head=T, sep="\t"))
p=p[which(p[,3] %in% c("Mutually exclusive","Weak")),]
p=p[intersect(which(p[,1] !="CD56"),which(p[,2] !="CD56")), ]
p=p[which(p[,1] %in% CITE_seq_protein ==T),]
p=p[which(p[,2] %in% CITE_seq_protein ==T),]
############# check that all markers are in the CITE-seq list
p[,1][which(p[,1] %in% CITE_seq_protein ==T)]
p[,2][which(p[,2] %in% CITE_seq_protein ==T)]
mutually_exclusive_CITE_seq = NULL
mutually_exclusive_CITE_seq_names = NULL
for(i in c(1:length(p[,1]))){
mutual1 = names(CITE_seq_protein) [which(CITE_seq_protein == p[i,1])]
mutual2 = names(CITE_seq_protein) [which(CITE_seq_protein == p[i,2])]
for(i1 in c(1:length(mutual1))){
for(i2 in c(1:length(mutual2))){
if(concat(c(mutual2[i2],":", mutual1[i1])) %in% mutually_exclusive_CITE_seq_names==F){
mutually_exclusive_CITE_seq = c(mutually_exclusive_CITE_seq, list(c(mutual1[i1], mutual2[i2])))
mutually_exclusive_CITE_seq_names=c(mutually_exclusive_CITE_seq_names, concat(c(mutual1[i1],":", mutual2[i2])))
}}}}
############# normalise the variables that will be used in the predictor
variables = pbmc@meta.data[,c("percent.mito", "nFeature_RNA","nCount_RNA","mito.ribo_ratio","nCount_ADT")]
rownames(variables)= cell_ids
variables_norm = variables
library(compositions)
for(i in c(1:length(variables[1,]))){
x = variables[,i]
names(x) = cell_ids
for(s in c(1:length(orig.idents))){
w = cell_ids[which(orig.ident== orig.idents[s])]
if(length(w)>0){
clr1 = x[w]/mean(x[w])
variables_norm[w,i] = as.numeric(clr1)
print(concat(c(i, " ", orig.idents[s] )))
}}}
metaD = variables_norm
############# functions: identify double positive cells
Boxplot_nUMI_ngene_mt_cells1<-function(doublets_list, fileout1,metaD){
headers = colnames(metaD)
all_cells = rownames(metaD)
library(RColorBrewer)
cols = add.alpha (brewer.pal(8, "Dark2"), alpha = 0.5)
w=2.5
pdf(file=fileout1, height=w*1.6*3, width=w*4*1.8)
par(mfrow= c(3,2), mar = c(18,4.5,3,1))
for(h in c(1:length(headers))){
groups = NULL
for(i in c(1:length(doublets_list))){
x = metaD[doublets_list[[i]], headers[h]]
groups = c(groups, list(metaD[doublets_list[[i]], headers[h]]))}
factors = names(doublets_list)
boxplot(groups, ylab = "", col = cols,names= factors, las = 2, main= headers[h], outline =T, border = "grey",cex.lab = 0.9, lwd = 0.5, cex.main = 0.7)
}
dev.off()
}
Identify_CITE_seq_doublets <-function(fileout1, orig.idents, orig.ident, mutually_exclusive_CITE_seq_names, CITE_genes1, CITE_score, metaD){
library(MASS)
library(RColorBrewer)
cols = add.alpha (brewer.pal(8, "Dark2"), alpha = 0.5)
variables = metaD
variables_names = colnames(variables)
doublets_list_all = list()
for(s in c(1:length(orig.idents))){
w = cell_ids [which(orig.ident== orig.idents[s])]
double_pos = NULL
single_pos = NULL
double_pos_classification = NULL
sample_positive_CITE = names(which(apply(CITE_score[w,],2,function(x){length(which(is.na(x)==F))})>0))
for(cc in c(1:length(mutually_exclusive_CITE_seq))){
cc1 = mutually_exclusive_CITE_seq[[cc]][1]
cc2 = mutually_exclusive_CITE_seq[[cc]][2]
cite_sub = CITE_genes[which(CITE_genes %in%(c(cc1,cc2)))]
# plot_CITE = names(which(rowSums(raw_data[names(cite_sub),w])>0))
plot_CITE = intersect(sample_positive_CITE,(cite_sub))
if(length(plot_CITE)==2){
x = CITE_score[w,plot_CITE[1]]
y = CITE_score[w,plot_CITE[2]]
w1 = w[intersect(which(is.finite(x)),which(is.finite(y)))]
w1 = w1[intersect(which(x[w1]>0.01), which(y[w1]>0.01))]
w2 = w[setdiff(which(is.finite(x)),which(is.finite(y)))]
w3 = w[setdiff(which(is.finite(y)),which(is.finite(x)))]
w23 = sort(unique(c(w2,w3)))
if(min(c(length(w1),length(w23)))>2){
double_pos = c(double_pos, w1)
single_pos = c(single_pos, w23)
double_pos_classification = c(double_pos_classification, rep(mutually_exclusive_CITE_seq_names[cc], length(w1)))
}}
}
single_pos = setdiff(w, double_pos)
doublets_list = NULL
names = mutually_exclusive_CITE_seq
var = variables[,"nCount_RNA"]
names(var) = rownames(variables)
summary = NULL
for(cc in c(1:length(mutually_exclusive_CITE_seq))){
x = var [double_pos[which(double_pos_classification== mutually_exclusive_CITE_seq_names[cc])]]
doublets_list = c(doublets_list, list(names(x)))
if(length(summary)==0){summary = summary(x)
}else{summary = rbind(summary, summary(x))}
}
x = var [single_pos]
summary = rbind(summary, summary(x))
doublets_list = c(doublets_list, list(single_pos))
rownames(summary) = c(mutually_exclusive_CITE_seq_names,"singlets")
names(doublets_list)= c(mutually_exclusive_CITE_seq_names,"singlets")
w = intersect(which(summary[,'Median']<1.05*summary['singlets','Median']), which(rownames(summary)!="singlets"))
w1 = intersect(which(summary[,'Median']<1.05*summary['singlets','Mean']), which(rownames(summary)!="singlets"))
w = sort(unique(c(w,w1)))
exclude_CITE = rownames(summary)[w]
doublets_list[w] = list(NULL)
######################## plot
names(doublets_list) = gsub("_TotalSeqC","",names(doublets_list) )
fileout1 =concat(c("Seurat_CITE_nGene_CITEseq_doublets_", type,"_", batch,"_", orig.idents[s],".pdf"))
Boxplot_nUMI_ngene_mt_cells1(doublets_list, fileout1,metaD)
w = which(double_pos_classification %in% exclude_CITE==F)
doublet_info = c(list(cbind(double_pos[w], double_pos_classification[w])),list(single_pos), list(summary))
names(doublet_info) = c("true_doublets", "singlets","summary_nUMIs")
doublets_list_all = c(doublets_list_all, list(doublet_info))
}
names(doublets_list_all) = orig.idents
return(doublets_list_all)
}
############# identify double positive cells
prefix_figures = concat(c("CITE_expression_", batch,"_"))
fileout1 = concat(c(prefix_figures,"3.txt"))
doublets_list_all =Identify_CITE_seq_doublets(fileout1, orig.idents, orig.ident, mutually_exclusive_CITE_seq_names,CITE_genes1, CITE_score, metaD)
types = NULL
for(i in c(1:length(doublets_list_all))){types = sort(unique(c(types, doublets_list_all[[i]][[1]][,2])))}
list_doublets_CITE = rep(list(c()), length(types))
names(list_doublets_CITE) = types
all_CITE_doublets = NULL
for(i in c(1:length(doublets_list_all))){
if(length(doublets_list_all[[i]][[1]])>0){
all_CITE_doublets = c(all_CITE_doublets, doublets_list_all[[i]][[1]][,1])}
for(t in types){
w = which(doublets_list_all[[i]][[1]][,2]==t)
list_doublets_CITE[[t]] = c(list_doublets_CITE[[t]], doublets_list_all[[i]][[1]][w,1])}}
all_CITE_doublets= sort(unique(all_CITE_doublets))
saveRDS(file=concat(c("CITE_doublets_", batch,".rds")), all_CITE_doublets)
############# plot nGenes, nUMIs and mt% between these methods
Boxplot_nUMI_ngene_mt_doublets<-function(doublets_list, fileout1,metaD){
headers = colnames(metaD)
all_cells = rownames(metaD)
library(RColorBrewer)
cols = add.alpha (brewer.pal(8, "Dark2"), alpha = 0.5)
w=2.5
pdf(file=fileout1, height=w*1.5, width=w*4*0.9)
par(mfrow= c(1,5), mar = c(18,4.5,3,1))
for(h in c(1:length(headers))){
groups = NULL
names = NULL
factors = c(names(doublets_list), "other")
for(i in c(1:length(doublets_list))){
x =metaD[doublets_list[[i]], headers[h]]
if(length(x)>2){
names = c(names, factors[i])
groups = c(groups, list(x))}}
x1 = unlist(groups)
groups= c(groups, list(metaD[setdiff(all_cells, unlist(doublets_list)), headers[h]]))
names = c(names, "other")
x2 = metaD[setdiff(all_cells, unlist(doublets_list)), headers[h]]
pval = wilcox.test(x1,y=x2,alternative = "two.sided")$p.value
pval1 = signif(pval,digits =3)
if(pval<1e-10){pval1 = "<1e-10"}
boxplot(groups, ylab = "", col = cols,names= names, las = 2, main= concat(c(headers[h],"\np-value ",pval1)), outline =T, border = "grey",cex.lab = 0.9, lwd = 0.5, cex.main = 0.7, ylim = c(0,max(unlist(groups))))
}
dev.off()
}
fileout1 = concat(c("CITE_expression_", batch,"_doublets.pdf"))
metaD = variables_norm
names(list_doublets_CITE) = gsub("_TotalSeqC","",names(list_doublets_CITE))
Boxplot_nUMI_ngene_mt_doublets(list_doublets_CITE, fileout1,metaD)
######################## Write output
saveRDS(file="Seurat_CITE_doublets.rds", list_doublets_CITE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.